In [1]:
import pandas as pd
from datetime import timedelta
# ****************** Program Settings ******************
Folder = "" # Location of program scripts
Data = "temp/" # Location to which temporary files are generated
DataSource = "data/" # Location of the original data files (ASCII)
Gap_Days = 60 # To qualify as a gap, the break between one fund's end and the next fund's inception must be at least 60 days.
# If you would like to allow overlapping funds, change this to a negative number.
# ******************************************************
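A quick toy check of the threshold on made-up dates (the break below is roughly 90 days, so it should qualify as a gap):
In [ ]:
# Toy illustration of the Gap_Days threshold (hypothetical dates, not from the data)
end_of_previous_fund = pd.Timestamp('2010-01-31')
start_of_next_fund = pd.Timestamp('2010-05-01')
print(end_of_previous_fund + timedelta(days=Gap_Days) <= start_of_next_fund)  # True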
In [2]:
# Load manager details
data_manager = pd.read_stata(DataSource + 'PeopleDetails.dta')
data_manager = data_manager[data_manager.PersonTypeID==1]
variables_to_keep = "ProductReference PersonID First Last JobTitle Address1 Address2 Address3 CityName StateName Zip CountryName".split()
data_manager = data_manager[variables_to_keep]
data_manager.head()
Out[2]:
In [3]:
# Extract InceptionDate / PerformanceEndDate
data_dates = pd.read_stata(DataSource + 'ProductDetails.dta')
variables_to_keep = "ProductReference InceptionDate PerformanceEndDate".split()
data_dates = data_dates[variables_to_keep]
data_dates.head()
Out[3]:
Merge managers with the start/end dates
In [4]:
# Inner join - non-matches are excluded
data_merged = data_manager.merge(data_dates, on='ProductReference', how='inner')
data_merged = data_merged.sort_values(by=['PersonID',
                                          'PerformanceEndDate',
                                          'InceptionDate',
                                          'ProductReference'])
data_merged.head()
Out[4]:
Find the first end date for each PersonID. The variable first_end_date is defined as the earliest PerformanceEndDate among each person's funds.
In [5]:
# Create a GroupBy object
grouped = data_merged[['PersonID', 'PerformanceEndDate']].groupby('PersonID', as_index=False, axis=0)  # axis=0 splits the DataFrame along its rows (the default)
grouped.groups
print(grouped.get_group(4))
grouped.min().head()
Out[5]:
In [6]:
# Find the smallest value in the end date for each PersonID
transformed = grouped.min()
transformed['first_end_date'] = transformed['PerformanceEndDate']
transformed = transformed.drop('PerformanceEndDate', axis=1)
transformed.head()
Out[6]:
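For reference, the same first_end_date table can be built in a single chained expression; this sketch is equivalent to the cell above and is not used by the rest of the pipeline, which keeps working with transformed.
In [ ]:
# Equivalent one-liner sketch (the pipeline continues to use `transformed` from above)
transformed_alt = (data_merged.groupby('PersonID', as_index=False)['PerformanceEndDate']
                   .min()
                   .rename(columns={'PerformanceEndDate': 'first_end_date'}))
transformed_alt.head()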
In [7]:
# Merge back to the main dataset
data_main = data_merged.merge(transformed, how='outer',
                              left_on='PersonID', right_on='PersonID')
data_main = data_main.sort_values(by=['PersonID',
                                      'InceptionDate',
                                      'PerformanceEndDate',
                                      'ProductReference'])
################################################
# DEBUG: uncomment to restrict the sample for testing;
# keep commented out for full runs
# data_main = data_main[:2000:]
# data_main = data_main[data_main.PersonID==799]
################################################
def find_gaps(person_panel):
    '''Find the number of gaps (relaunches) in one person's fund history.
    A gap follows fund i when (1) fund i ends no earlier than every fund
    incepted before it, (2) fund i's PerformanceEndDate lies at least
    Gap_Days before the InceptionDate of every fund incepted later, and
    (3) at least one such later fund exists.
    Returns the number of gaps and the ProductReference of the fund
    preceding each gap.
    '''
    gaps_number = 0
    fund_ID_preceed_gap = []
    # Reset the index so it runs from 0
    person_panel = person_panel.reset_index(drop=True)
    # print("Her data panel is as below:")
    # print(person_panel)
    for i, i_row in person_panel.iterrows():
        # Reset criteria status for a new i_row
        criteria_backward = True
        criteria_forward = True
        criteria_exist_after = False
        for j, j_row in person_panel.iterrows():
            if i == j:
                # Skip the comparison of a fund with itself
                continue
            # print('Comparison now made for row: ', i, j)
            # Criterion 1 - Looking backward: i_end >= j_end for all j with j_inc < i_inc
            # For all funds incepted earlier than i:
            # j: Inc-----End
            # i:   Inc------------End
            if j_row.InceptionDate < i_row.InceptionDate:
                criteria_backward &= check_backward(i_row, j_row)
            # Criterion 2 - Looking forward: i_end + Gap_Days <= j_inc for all j with j_inc >= i_inc
            # (the Gap_Days threshold is applied inside check_forward)
            # For all funds incepted later than i:
            # i: Inc---------End
            #                   |***GAP***|
            # j:                          Inc--------End
            if j_row.InceptionDate >= i_row.InceptionDate:
                criteria_forward &= check_forward(i_row, j_row)
            # Criterion 3 - There must be at least one fund incepted after fund i
            if j_row.InceptionDate >= i_row.InceptionDate:
                criteria_exist_after = True
        # If criteria 1, 2 and 3 are all satisfied,
        # fund i is the fund preceding a gap
        if criteria_backward and criteria_forward and criteria_exist_after:
            fund_ID_preceed_gap.append(i_row.ProductReference)
            gaps_number += 1
    return (gaps_number, fund_ID_preceed_gap)

def check_backward(i_row, j_row):
    """
    Compares the PerformanceEndDate of i_row with that of j_row.
    Returns True if i_end >= j_end;
    returns False otherwise.
    """
    if i_row.PerformanceEndDate >= j_row.PerformanceEndDate:
        return True
    else:
        return False

def check_forward(i_row, j_row):
    """
    Compares the PerformanceEndDate of i_row to the InceptionDate of j_row.
    Returns True if i_end + Gap_Days <= j_inc, i.e. the break is long enough;
    returns False otherwise.
    """
    global Gap_Days  # Gap_Days comes from the settings cell at the top
    gap_days = timedelta(days=Gap_Days)
    if i_row.PerformanceEndDate + gap_days <= j_row.InceptionDate:
        return True
    else:
        return False
grouped_by_person = data_main.groupby('PersonID', as_index=False, axis=0)
grouped_by_person.groups
# A new dict to store number of gaps found for each person
gaps = {}
fund_IDs = {}
for person_id, person_panel in grouped_by_person:
    # print("The person's PersonID is :", person_id)
    gaps_number, fund_ID_preceed_gap = find_gaps(person_panel[['ProductReference',
                                                               'InceptionDate',
                                                               'PerformanceEndDate']])
    gaps[person_id] = gaps_number
    fund_IDs[person_id] = fund_ID_preceed_gap
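As a sanity check, consider a hypothetical three-fund manager (dates invented for illustration): fund B ends more than 60 days before fund C starts, so find_gaps should report one gap, preceded by fund B.
In [ ]:
# Sanity check on hypothetical data (not from the source files)
toy_panel = pd.DataFrame({'ProductReference': ['A', 'B', 'C'],
                          'InceptionDate': pd.to_datetime(['2005-01-01', '2006-01-01', '2009-06-01']),
                          'PerformanceEndDate': pd.to_datetime(['2007-12-31', '2008-06-30', '2012-12-31'])})
print(find_gaps(toy_panel))  # Expected: (1, ['B'])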
In [18]:
# Transform the gaps dict into a DataFrame (fund_IDs is handled in the next cell)
data_gaps = pd.DataFrame.from_dict(gaps, orient='index')
data_gaps.columns = ['number_of_gaps']
data_gaps['PersonID'] = data_gaps.index
data_gaps = data_gaps.reset_index(drop=True)
data_gaps.head()
Out[18]:
In [20]:
data_gap_fund_IDs = pd.DataFrame.from_dict(fund_IDs, orient='index')
data_gap_fund_IDs.columns = ['fund_IDs_proceeding_1st_gap', 'fund_IDs_proceeding_2nd_gap']
data_gap_fund_IDs['PersonID'] = data_gap_fund_IDs.index
data_gap_fund_IDs = data_gap_fund_IDs.reset_index(drop=True)
data_gap_fund_IDs.head()
Out[20]:
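The fixed two-column labels above assume that no manager has more than two gaps. A sketch that labels however many columns actually appear (a hypothetical alternative, not used by the cells below):
In [ ]:
# Sketch: label one column per gap, however many gaps the widest row has
alt_fund_IDs = pd.DataFrame.from_dict(fund_IDs, orient='index')
alt_fund_IDs.columns = ['fund_IDs_preceding_gap_%d' % (i + 1)
                        for i in range(alt_fund_IDs.shape[1])]
alt_fund_IDs.head()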
In [21]:
# Merge with the main dataset
data_output = data_main.merge(data_gaps, how='outer',
                              left_on='PersonID', right_on='PersonID')
data_output = data_output.merge(data_gap_fund_IDs, how='outer',
                                left_on='PersonID', right_on='PersonID')
data_output.head()
Out[21]:
In [22]:
# Output datafiles
data_output.to_excel('data_output.xlsx')
data_output.to_stata('data_output.dta', convert_dates={13:'td', 14:'td', 15:'td'})
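convert_dates also accepts column names as keys, which is less fragile than positional indices if the column order ever changes. A sketch under the assumption that InceptionDate, PerformanceEndDate and first_end_date are the three datetime columns; it writes to a separate file so the positional version above is untouched.
In [ ]:
# Sketch: refer to the date columns by name rather than by position
# (assumes the three datetime columns are the ones named below)
data_output.to_stata('data_output_named_dates.dta',
                     convert_dates={'InceptionDate': 'td',
                                    'PerformanceEndDate': 'td',
                                    'first_end_date': 'td'})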